Species Distribution Modeling

Part 1: Explore

Install Packages

# load packages, installing if missing
if (!require(librarian)){
  install.packages("librarian")
  library(librarian)
}
librarian::shelf(
  dismo, dplyr, DT, ggplot2, here, htmltools, leaflet, mapview, purrr, raster, readr, rgbif, rgdal, rJava, sdmpredictors, sf, spocc, tidyr)
select <- dplyr::select # overwrite raster::select
options(readr.show_col_types = FALSE)
# set random seed for reproducibility
set.seed(42)
# directory to store data
dir_data <- here("data/sdm")
dir.create(dir_data, showWarnings = F, recursive = T)

Get Species Observations

obs_csv <- file.path(dir_data, "obs.csv")
obs_geo <- file.path(dir_data, "obs.geojson")
redo    <- FALSE
if (!file.exists(obs_geo) | redo){
  # get species occurrence data from GBIF with coordinates
  (res <- spocc::occ(
    query = 'Haliaeetus leucocephalus', 
    from = 'gbif', has_coords = T, limit = 10000))
  
  # extract data frame from result
  df <- res$gbif$data[[1]] 
  readr::write_csv(df, obs_csv)
  
  # convert to points of observation from lon/lat columns in data frame
  obs <- df %>% 
    sf::st_as_sf(
      coords = c("longitude", "latitude"),
      crs = st_crs(4326)) %>% 
    select(prov, key) # save space (joinable from obs_csv)
  sf::write_sf(obs, obs_geo, delete_dsn=T)
}
obs <- sf::read_sf(obs_geo)
nrow(obs) # number of rows
## [1] 10000
# show points on map
mapview::mapview(obs, map.types = "Esri.WorldPhysical")

Get Environmental Data

Presence

dir_env <- file.path(dir_data, "env")

# set a default data directory
options(sdmpredictors_datadir = dir_env)

# choosing terrestrial
env_datasets <- sdmpredictors::list_datasets(terrestrial = TRUE, marine = FALSE)

# show table of datasets
env_datasets %>% 
  select(dataset_code, description, citation) %>% 
  DT::datatable()
# choose datasets for a vector
env_datasets_vec <- c("WorldClim", "ENVIREM")

# get layers
env_layers <- sdmpredictors::list_layers(env_datasets_vec)
DT::datatable(env_layers)
# choose layers after some inspection and perhaps consulting literature
env_layers_vec <- c("WC_alt", "WC_bio1", "WC_bio2", "ER_tri", "ER_topoWet")

# get layers
env_stack <- load_layers(env_layers_vec)

# interactive plot layers, hiding all but first (select others)
# mapview(env_stack, hide = T) # makes the html too big for Github
plot(env_stack, nc=2)

# crop the environmental rasters to a reasonable study area around our species observations
obs_hull_geo  <- file.path(dir_data, "obs_hull.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")

if (!file.exists(obs_hull_geo) | redo){
  # make convex hull around points of observation
  obs_hull <- sf::st_convex_hull(st_union(obs))
  
  # save obs hull
  write_sf(obs_hull, obs_hull_geo)
}
obs_hull <- read_sf(obs_hull_geo)

# show points on map
mapview(
  list(obs, obs_hull))
if (!file.exists(env_stack_grd) | redo){
  obs_hull_sp <- sf::as_Spatial(obs_hull)
  env_stack <- raster::mask(env_stack, obs_hull_sp) %>% 
    raster::crop(extent(obs_hull_sp))
  writeRaster(env_stack, env_stack_grd, overwrite=T)  
}
env_stack <- stack(env_stack_grd)

# show map
# mapview(obs) + 
#   mapview(env_stack, hide = T) # makes html too big for Github
plot(env_stack, nc=2)

Pseudo-Absence

absence_geo <- file.path(dir_data, "absence.geojson")
pts_geo     <- file.path(dir_data, "pts.geojson")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

if (!file.exists(absence_geo) | redo){
  # get raster count of observations
  r_obs <- rasterize(
    sf::as_Spatial(obs), env_stack[[1]], field=1, fun='count')
  
  # show map
  # mapview(obs) + 
  #   mapview(r_obs)
  
  # create mask for 
  r_mask <- mask(env_stack[[1]] > -Inf, r_obs, inverse=T)
  
  # generate random points inside mask
  absence <- dismo::randomPoints(r_mask, nrow(obs)) %>% 
    as_tibble() %>% 
    st_as_sf(coords = c("x", "y"), crs = 4326)
  
  write_sf(absence, absence_geo, delete_dsn=T)
}
absence <- read_sf(absence_geo)

# show map of presence, ie obs, and absence
mapview(obs, col.regions = "green") + 
  mapview(absence, col.regions = "gray")
if (!file.exists(pts_env_csv) | redo){

  # combine presence and absence into single set of labeled points 
  pts <- rbind(
    obs %>% 
      mutate(
        present = 1) %>% 
      select(present, key),
    absence %>% 
      mutate(
        present = 0,
        key     = NA)) %>% 
    mutate(
      ID = 1:n()) %>% 
    relocate(ID)
  write_sf(pts, pts_geo, delete_dsn=T)

  # extract raster values for points
  pts_env <- raster::extract(env_stack, as_Spatial(pts), df=TRUE) %>% 
    tibble() %>% 
    # join present and geometry columns to raster value results for points
    left_join(
      pts %>% 
        select(ID, present),
      by = "ID") %>% 
    relocate(present, .after = ID) %>% 
    # extract lon, lat as single columns
    mutate(
      #present = factor(present),
      lon = st_coordinates(geometry)[,1],
      lat = st_coordinates(geometry)[,2]) %>% 
    select(-geometry)
  write_csv(pts_env, pts_env_csv)
}
pts_env <- read_csv(pts_env_csv)

pts_env %>% 
  # show first 10 presence, last 10 absence
  slice(c(1:10, (nrow(pts_env)-9):nrow(pts_env))) %>% 
  DT::datatable(
    rownames = F,
    options = list(
      dom = "t",
      pageLength = 20))
nrow(pts_env)
## [1] 20000
datatable(pts_env, rownames = F)

Term Plots

pts_env %>% 
  select(-ID) %>% 
  mutate(
    present = factor(present)) %>% 
  pivot_longer(-present) %>% 
  ggplot() +
  geom_density(aes(x = value, fill = present)) + 
  scale_fill_manual(values = alpha(c("gray", "green"), 0.5)) +
  scale_x_continuous(expand=c(0,0)) +
  scale_y_continuous(expand=c(0,0)) +
  theme_bw() + 
  facet_wrap(~name, scales = "free") +
  theme(
    legend.position = c(1, 0),
    legend.justification = c(1, 0))

## Pairs plot to show correlations between variables

GGally::ggpairs(
  select(pts_env, -ID),
  aes(color = factor(present)))

# Part 2: Logistic Regression ## Setup Data - Drop rows with any NAs - remove terms we don’t want to model - use a simplified formula _present_ ~. to predict where the species is present based on all other fields in the data from (ie. y ~ X1 + X2 + …Xn)

d <- pts_env %>% 
  select(-ID) %>% # remove terms we don't want to model 
  tidyr::drop_na() # drop the rows with NA values
nrow(d)
## [1] 19934

Linear Model

# fit a linear model
mdl_linear <- lm(present ~ ., data = d)
summary(mdl_linear)
## 
## Call:
## lm(formula = present ~ ., data = d)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.28636 -0.33804  0.04561  0.35671  1.32955 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.762e-01  1.111e-01   3.388 0.000707 ***
## WC_alt       1.445e-04  1.195e-05  12.088  < 2e-16 ***
## WC_bio1      6.831e-02  2.230e-03  30.636  < 2e-16 ***
## WC_bio2     -6.595e-02  2.071e-03 -31.844  < 2e-16 ***
## ER_tri      -2.455e-03  1.724e-04 -14.238  < 2e-16 ***
## ER_topoWet  -5.597e-02  3.489e-03 -16.045  < 2e-16 ***
## lon          6.501e-03  2.897e-04  22.442  < 2e-16 ***
## lat          3.724e-02  1.976e-03  18.840  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4322 on 19926 degrees of freedom
## Multiple R-squared:  0.2532, Adjusted R-squared:  0.253 
## F-statistic: 965.3 on 7 and 19926 DF,  p-value: < 2.2e-16
y_predict <- predict(mdl_linear, d, type="response")
y_true <- d$present
range(y_predict)
## [1] -0.329554  1.286362
range(y_true)
## [1] 0 1

Generalized Linear Model

# fit a generalized linear model with a binomial logit link function
mdl_glm <- glm(present ~ ., family = binomial(link="logit"), data = d)
summary(mdl_glm)
## 
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"), 
##     data = d)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7969  -0.8593  -0.1855   0.9010   2.9607  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.763e+00  5.985e-01  -2.945  0.00323 ** 
## WC_alt       7.552e-04  6.628e-05  11.394  < 2e-16 ***
## WC_bio1      3.629e-01  1.250e-02  29.038  < 2e-16 ***
## WC_bio2     -3.241e-01  1.199e-02 -27.020  < 2e-16 ***
## ER_tri      -1.305e-02  9.681e-04 -13.477  < 2e-16 ***
## ER_topoWet  -2.757e-01  1.888e-02 -14.602  < 2e-16 ***
## lon          3.354e-02  1.583e-03  21.183  < 2e-16 ***
## lat          2.077e-01  1.088e-02  19.087  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27634  on 19933  degrees of freedom
## Residual deviance: 21989  on 19926  degrees of freedom
## AIC: 22005
## 
## Number of Fisher Scoring iterations: 4
y_predict_glm <- predict(mdl_glm, d, type = "response")
range(y_predict_glm)
## [1] 0.01209068 0.97998789

Look at the terms plots to see the relationship between predictor and response

# show term plots
termplot(mdl_glm, partial.resid = TRUE, se = TRUE, main = F, ylim = "free")

## Generalize Additive Model With a general additive model we can add “wiggle” to the relationship between predictor and response by introducing smooth s() terms

librarian::shelf(mgcv)

# fit a generalized additive model with smooth predictors
mdl_gen_add <- mgcv::gam(
  formula = present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio2) + s(ER_tri) + s(ER_topoWet) + s(lon) + s(lat),
  family = binomial, data = d)
summary(mdl_gen_add)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio2) + s(ER_tri) + s(ER_topoWet) + 
##     s(lon) + s(lat)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.1347     0.0357  -3.772 0.000162 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                 edf Ref.df Chi.sq p-value    
## s(WC_alt)     8.762  8.979 490.73  <2e-16 ***
## s(WC_bio1)    7.950  8.455 349.27  <2e-16 ***
## s(WC_bio2)    8.767  8.980 462.99  <2e-16 ***
## s(ER_tri)     8.772  8.984 112.77  <2e-16 ***
## s(ER_topoWet) 8.454  8.894  73.74  <2e-16 ***
## s(lon)        7.486  8.449 163.18  <2e-16 ***
## s(lat)        8.872  8.992 229.68  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.441   Deviance explained = 38.8%
## UBRE = -0.14616  Scale est. = 1         n = 19934
# show term plot
plot(mdl_gen_add, scale=0)

Maxent (Maximum Entropy)

# load extra packages
librarian::shelf(
  maptools, sf)

mdl_maxent_rds <- file.path(dir_data, "mdl_maxent.rds")

# show version of maxent
if (!interactive())
  maxent()
## This is MaxEnt version 3.4.3
# get environmental rasters
# NOTE: the first part of Lab 1. SDM - Explore got updated to write this clipped environmental raster stack

# env_stack_grd <- file.path(dir_data, "env_stack.grd")
# env_stack <- stack(env_stack_grd)
# plot(env_stack, nc=2)
# get the presence-only observation points (maxent extracts raster values for you)
# obs_geo2<- file.path(dir_data, "obs")
obs_sp <- read_sf(obs_geo) %>%
  sf::as_Spatial() # maxent prefers sp::SpatialPoints over newer sf::sf class

# fit a maxent entropy model
if (!file.exists(mdl_maxent_rds)){
  mdl_maxent <- maxent(env_stack, obs_sp)
  readr::write_rds(mdl_maxent, mdl_maxent_rds)
}
mdl_maxent <- read_rds(mdl_maxent_rds)
# plot variable contributions per predictor
plot(mdl_maxent)

# plot term plots
response(mdl_maxent)

# predict
y_predict_maxent <- predict(env_stack, mdl_maxent) #, ext=ext, progress='')

plot(y_predict_maxent, main='Maxent, raw prediction')
data(wrld_simpl, package = "maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')

# Part 3: Decision Trees Use decision trees as a classification technique to the data with the response being categorical (factor(present))

# load packages
librarian:::shelf(
  caret, # X: modeling framework
  dplyr, ggplot, here, reader, 
  pdp, # X: partial dependence plots
  rpart, # m: recursive partition modeling
  rpart.plot, # m: recusive partition plotting
  rsample, # d: split train/test data
  skimr, # d: skim summarize data table
  vip) # X: variable importance
# options
options(
  scipen = 999,
  readr.show_col_types = F)
set.seed(42)

# graphical theme
ggplot2::theme_set(ggplot2::theme_light())

# paths
# dir_data    <- here("data/sdm")
# pts_env_csv <- file.path(dir_data, "pts_env.csv")

# read data
# pts_env <- read_csv(pts_env_csv)
d <- pts_env %>% 
  select(-ID) %>% # not used as a predictor x
  mutate(
    present = factor(present)) %>% 
  na.omit()
skim(d)
Data summary
Name d
Number of rows 19934
Number of columns 8
_______________________
Column type frequency:
factor 1
numeric 7
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
present 0 1 FALSE 2 0: 9978, 1: 9956

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
WC_alt 0 1 491.02 583.19 -68.00 86.00 262.00 660.00 3690.00 ▇▂▁▁▁
WC_bio1 0 1 8.03 6.87 -11.80 3.70 8.80 12.00 25.30 ▁▃▇▅▂
WC_bio2 0 1 11.83 2.60 4.10 10.20 11.60 13.20 20.30 ▁▅▇▂▁
ER_tri 0 1 25.61 35.19 0.00 4.94 11.17 31.13 261.15 ▇▁▁▁▁
ER_topoWet 0 1 11.48 1.72 6.48 10.29 11.77 12.77 15.17 ▁▃▆▇▂
lon 0 1 -102.45 21.58 -176.66 -121.19 -101.38 -83.46 -52.67 ▁▁▇▇▃
lat 0 1 44.46 8.83 24.34 38.79 44.46 49.21 67.58 ▂▆▇▂▂

Split data into training and testing

# create training set with 80% of full data
d_split <- rsample::initial_split(d, prop = 0.8, strata = "present")
d_train <- rsample::training(d_split)

# show number of rows present in 0 vs 1
table(d$present)
## 
##    0    1 
## 9978 9956
table(d_train$present)
## 
##    0    1 
## 7982 7964

Partition, depth=1

# run decision stump model
mdl_stump <- rpart(
  present ~ ., data = d_train,
  control = list(
    cp = 0, minbucket = 5, maxdepth = 1))
mdl_stump
## n= 15946 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 15946 7964 0 (0.5005644 0.4994356)  
##   2) WC_bio1< 3.25 3728  459 0 (0.8768777 0.1231223) *
##   3) WC_bio1>=3.25 12218 4713 1 (0.3857423 0.6142577) *
# plot tree
par(mar = c(1, 1, 1, 1))
rpart.plot(mdl_stump)

Partition, depth=default

# decision tree with defaults
mdl_default_tree <- rpart(present ~ ., data = d_train)
mdl_default_tree
## n= 15946 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 15946 7964 0 (0.5005644 0.4994356)  
##    2) WC_bio1< 3.25 3728  459 0 (0.8768777 0.1231223) *
##    3) WC_bio1>=3.25 12218 4713 1 (0.3857423 0.6142577)  
##      6) WC_bio2>=12.25 5271 1865 0 (0.6461772 0.3538228)  
##       12) lon>=-117.7967 4520 1380 0 (0.6946903 0.3053097) *
##       13) lon< -117.7967 751  266 1 (0.3541944 0.6458056) *
##      7) WC_bio2< 12.25 6947 1307 1 (0.1881388 0.8118612) *
rpart.plot(mdl_default_tree)

# plot complexity parameter
plotcp(mdl_default_tree)

# rpart cross validation results
mdl_default_tree$cptable
##           CP nsplit rel error    xerror        xstd
## 1 0.35057760      0 1.0000000 1.0248619 0.007925790
## 2 0.19349573      1 0.6494224 0.6540683 0.007436379
## 3 0.02749874      2 0.4559267 0.4588147 0.006664053
## 4 0.01000000      3 0.4284279 0.4342039 0.006534341

Feature Interpretation

# caret cross validation results
mdl_caret <- train(
  present ~ .,
  data = d_train,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 10),
  tuneLength = 20)

ggplot(mdl_caret)

vip(mdl_caret, num_features = 40, bar = FALSE)

# construct partial dependence plots
p1 <- partial(mdl_caret, pred.var = "lat") %>%  autoplot()
p2 <- partial(mdl_caret, pred.var = "WC_bio2") %>% autoplot()
p3 <- partial(mdl_caret, pred.var = c("lat", "WC_bio2")) %>% 
  plotPartial(levelplot = FALSE, zlab = "yhat", drape = TRUE,
              colorkey = TRUE, screen = list(z = -20, x = -60))
# display plots side by side
gridExtra::grid.arrange(p1, p2, p3, ncol = 3)

## Random Forests

# load additional packages
librarian::shelf(
  ranger) # random forest modeling

Fit

# number of features
n_features <- length(setdiff(names(d_train), "present"))

# fit a default random forest model
mdl_rf <- ranger(present ~ ., data = d_train)

# get out of the box RMSE
(default_rmse <- sqrt(mdl_rf$prediction.error))
## [1] 0.3241986

Feature Interpretation

# re-run model with impurity-based variable importance
mdl_impurity <- ranger(
  present ~ ., data = d_train,
  importance = "impurity")

# re-run model with permutation-based variable importance
mdl_permutation <- ranger(
  present ~ ., data = d_train,
  importance = "permutation")
p1_rf <- vip::vip(mdl_impurity, bar = FALSE)
p2_rf <- vip::vip(mdl_permutation, bar = FALSE)

gridExtra::grid.arrange(p1_rf, p2_rf, nrow = 1)